Wykorzystane biblioteki

library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(caret)
library(data.table)
library(tibble)

Wczytywanie danych

# Names of the CSV columns that are skipped while the file is read (passed to
# the `drop` argument of fread() below).
removable_columns <- c("title", "pdb_code", "res_id", "chain_id", "local_res_atom_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count", "skeleton_data", "skeleton_cycle_4", "skeleton_diameter", "skeleton_cycle_6", "skeleton_cycle_7", "skeleton_closeness_006_008", "skeleton_closeness_002_004", "skeleton_cycle_3", "skeleton_avg_degree", "skeleton_closeness_004_006", "skeleton_closeness_010_012", "skeleton_closeness_012_014", "skeleton_edges", "skeleton_radius", "skeleton_cycle_8_plus", "skeleton_closeness_020_030", "skeleton_deg_5_plus", "skeleton_closeness_016_018", "skeleton_closeness_008_010", "skeleton_closeness_018_020", "skeleton_average_clustering", "skeleton_closeness_040_050", "skeleton_closeness_014_016", "skeleton_center", "skeleton_closeness_000_002", "skeleton_density", "skeleton_closeness_030_040", "skeleton_deg_4", "skeleton_deg_0", "skeleton_deg_1", "skeleton_deg_2", "skeleton_deg_3", "skeleton_graph_clique_number", "skeleton_nodes", "skeleton_cycles", "skeleton_cycle_5", "skeleton_closeness_050_plus", "skeleton_periphery", "fo_col", "fc_col", "weight_col", "grid_space", "solvent_radius", "solvent_opening_radius", "part_step_FoFc_std_min", "part_step_FoFc_std_max", "part_step_FoFc_std_step")

# Read at most the first 10000 data rows of the summary file (the path is
# relative to the working directory), treating the first line as the header
# and skipping the columns listed above.
data <- fread("./all_summary.csv", nrows = 10000, header = TRUE, drop = removable_columns)
# Report the number of rows and columns that were loaded.
dim(data)
## [1] 10000   350

Przetwarzanie brakujących danych

# Remove every row that has a missing value in any column; the dimensions are
# printed before and after so the number of dropped rows is visible.
dim(data)
## [1] 10000   350
data <- drop_na(data)
dim(data)
## [1] 8958  350

Usuwanie niepotrzebnych ligandów

# Residue names that are excluded from the analysis.
# NOTE(review): "H20" is spelled with the digit zero and is kept verbatim;
# confirm whether "H2O" was intended.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS",
  "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO",
  "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A",
  "G", "T", "C", "U", "HOH", "H20", "WAT"
)
# Keep only the rows whose res_name is not on the exclusion list.
data <- filter(data, !(res_name %in% deletable_res_name))
dim(data)
## [1] 8910  350

Podsumowanie danych

# Pick the ligand name and the two blob-volume coverage columns, then render
# their summary() as a table.
statistics <- select(
  data,
  res_name,
  blob_volume_coverage,
  blob_volume_coverage_second
)
statistics %>%
  summary() %>%
  kable()
res_name blob_volume_coverage blob_volume_coverage_second
Length:8910 Min. :0.02305 Min. :0.00000
Class :character 1st Qu.:0.50648 1st Qu.:0.00000
Mode :character Median :0.72244 Median :0.00000
NA Mean :0.66784 Mean :0.02067
NA 3rd Qu.:0.86480 3rd Qu.:0.00000
NA Max. :1.00000 Max. :0.95385
dim(data)
## [1] 8910  350

50 najpopularniejszych ligandów

# Count the rows per res_name, sort the counts in descending order and keep the
# first 50 entries.
popular_ligands <- data %>%
  count(res_name, sort = TRUE) %>%
  slice(seq_len(50))

# Plain character vector with the 50 retained names.
popular_names_vector <- popular_ligands$res_name

# Restrict the data set to rows whose res_name is one of those names.
data <- filter(data, res_name %in% popular_names_vector)
dim(data)
## [1] 6239  350

Liczność najpopularniejszych ligandów według nazwy

# Bar chart of the row count (n) per res_name, with bars ordered from the most
# to the least frequent name and coloured by the count.
plot_ligands <- ggplot(
  popular_ligands,
  aes(x = reorder(res_name, -n), y = n, fill = n)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    x = "ligand",
    y = "liczność",
    title = "Liczność ligandów według nazwy"
  )

# Convert the static plot to an interactive plotly widget.
ggplotly(plot_ligands)

Korelacja między zmiennymi

# Correlation matrix of all numeric columns, reshaped to one row per ordered
# pair of columns; only pairs of two different columns with a correlation of
# at least 0.9999 are listed.
data %>%
  select_if(is.numeric) %>%
  cor() %>%
  as.data.frame() %>%
  rownames_to_column(var = "rowname") %>%
  gather(key = "rowname2", value = "value", -rowname) %>%
  filter(rowname != rowname2, value >= 0.9999)
##                           rowname                       rowname2     value
## 1                     part_00_max                      local_max 1.0000000
## 2                     part_01_max                      local_max 1.0000000
## 3                     part_02_max                      local_max 1.0000000
## 4            part_00_max_over_std             local_max_over_std 1.0000000
## 5            part_01_max_over_std             local_max_over_std 1.0000000
## 6            part_02_max_over_std             local_max_over_std 0.9999999
## 7  part_00_density_segments_count   part_00_shape_segments_count 1.0000000
## 8    part_00_shape_segments_count part_00_density_segments_count 1.0000000
## 9              part_00_shape_M000                 part_00_volume 1.0000000
## 10           part_00_density_M000              part_00_electrons 1.0000000
## 11                      local_max                    part_00_max 1.0000000
## 12                    part_01_max                    part_00_max 1.0000000
## 13                    part_02_max                    part_00_max 1.0000000
## 14             local_max_over_std           part_00_max_over_std 1.0000000
## 15           part_01_max_over_std           part_00_max_over_std 1.0000000
## 16           part_02_max_over_std           part_00_max_over_std 0.9999999
## 17                 part_00_volume             part_00_shape_M000 1.0000000
## 18             part_01_density_FL             part_00_density_FL 0.9999229
## 19             part_01_density_I4             part_00_density_I4 0.9999031
## 20              part_00_electrons           part_00_density_M000 1.0000000
## 21 part_01_density_segments_count   part_01_shape_segments_count 1.0000000
## 22   part_01_shape_segments_count part_01_density_segments_count 1.0000000
## 23             part_01_shape_M000                 part_01_volume 1.0000000
## 24           part_01_density_M000              part_01_electrons 1.0000000
## 25                      local_max                    part_01_max 1.0000000
## 26                    part_00_max                    part_01_max 1.0000000
## 27                    part_02_max                    part_01_max 1.0000000
## 28             local_max_over_std           part_01_max_over_std 1.0000000
## 29           part_00_max_over_std           part_01_max_over_std 1.0000000
## 30           part_02_max_over_std           part_01_max_over_std 0.9999999
## 31                 part_01_volume             part_01_shape_M000 1.0000000
## 32             part_00_density_FL             part_01_density_FL 0.9999229
## 33             part_00_density_I4             part_01_density_I4 0.9999031
## 34              part_01_electrons           part_01_density_M000 1.0000000
## 35 part_02_density_segments_count   part_02_shape_segments_count 1.0000000
## 36   part_02_shape_segments_count part_02_density_segments_count 1.0000000
## 37             part_02_shape_M000                 part_02_volume 1.0000000
## 38           part_02_density_M000              part_02_electrons 1.0000000
## 39                      local_max                    part_02_max 1.0000000
## 40                    part_00_max                    part_02_max 1.0000000
## 41                    part_01_max                    part_02_max 1.0000000
## 42             local_max_over_std           part_02_max_over_std 0.9999999
## 43           part_00_max_over_std           part_02_max_over_std 0.9999999
## 44           part_01_max_over_std           part_02_max_over_std 0.9999999
## 45                 part_02_volume             part_02_shape_M000 1.0000000
## 46              part_02_electrons           part_02_density_M000 1.0000000

Rozkłady gęstościowe liczb

Atomów

# Density plot of the local_res_atom_non_h_count column.
plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) +
  geom_density(fill = "#00CECB", color = NA, alpha = 0.3) +
  labs(
    x = "liczność atomów",
    y = "gęstość",
    title = "Rozkład gęstościowy atomów"
  )

# Convert the static plot to an interactive plotly widget.
ggplotly(plot_atom)

Elektronów

# Density plot of the local_res_atom_non_h_electron_sum column.
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) +
  geom_density(fill = "#FF5E5B", color = NA, alpha = 0.3) +
  labs(
    x = "liczność elektronów",
    y = "gęstość",
    title = "Rozkład gęstościowy elektronów"
  )

# Convert the static plot to an interactive plotly widget.
ggplotly(plot_electron)

Rozkład wartości kolumn part_01

# Replace outliers in a numeric vector with NA using the 1.5 * IQR rule.
#
# A value is treated as an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR, where Q1 and Q3 are the 25% and 75% quantiles of `data`.
#
# Arguments:
#   data   Numeric vector to clean.
#   na.rm  Passed to quantile(); when TRUE, missing values are ignored while
#          the quartiles are computed.
#   ...    Further arguments forwarded to quantile() (e.g. `type`).
#
# Returns a vector of the same length as `data` in which every outlier has
# been replaced by NA; values that were already NA stay NA.
remove_outliers <- function(data, na.rm = TRUE, ...) {
  quartiles <- quantile(data, probs = c(0.25, 0.75), na.rm = na.rm, ...)
  # Derive the fence width from the quartiles computed above, so that any
  # options passed through `...` apply to the quartiles and to the
  # inter-quartile range alike.
  fence <- 1.5 * (quartiles[[2]] - quartiles[[1]])
  lower_bound <- quartiles[[1]] - fence
  upper_bound <- quartiles[[2]] + fence

  # Missing values are never flagged; they simply remain NA in the result.
  is_outlier <- !is.na(data) & (data < lower_bound | data > upper_bound)
  data[is_outlier] <- NA
  data
}

# Box plots of every column whose name contains "part_01".
# Step 1: select those columns.
plot_part_data <- data %>%
  select(contains("part_01"))
dim(plot_part_data)
## [1] 6239  106
# Step 2: apply remove_outliers() to each column, which replaces outliers
# with NA.
plot_part_data <- plot_part_data %>%
  sapply(remove_outliers) %>%
  as.data.frame()
dim(plot_part_data)
## [1] 6239  106
# Step 3: drop every row in which at least one column was flagged.
plot_part_data <- plot_part_data %>%
  drop_na()
dim(plot_part_data)
## [1] 3028  106
# Step 4: reshape to long format (column name in `part`, cell in `value`).
# All columns are gathered via everything() instead of a hard-coded 1:106
# range, so the step keeps working when the number of selected columns changes.
plot_part_data <- plot_part_data %>%
  gather(part, value, everything())

# One horizontal box per original column.
plot_ly(plot_part_data, x = plot_part_data$value, y = plot_part_data$part, type = 'box', height = 2000)

Największe niezgodności liczby

Atomów

# For every res_name, average the absolute difference between
# local_res_atom_non_h_count and dict_atom_non_h_count, then show the ten
# names with the largest average as a table.
data %>%
  group_by(res_name) %>%
  summarise(
    atom_inconsistency = mean(
      abs(local_res_atom_non_h_count - dict_atom_non_h_count)
    )
  ) %>%
  arrange(desc(atom_inconsistency)) %>%
  slice(seq_len(10)) %>%
  kable()
res_name atom_inconsistency
PLC 17.1481481
LHG 4.4615385
C8E 2.6428571
NDP 1.7333333
NAP 1.5090909
PG4 1.4225352
MLY 1.2222222
CME 1.0000000
MAN 1.0000000
NAG 0.9949495

Elektronów

# For every res_name, average the absolute difference between
# local_res_atom_non_h_electron_sum and dict_atom_non_h_electron_sum, then
# show the ten names with the largest average as a table.
data %>%
  group_by(res_name) %>%
  summarise(
    electron_inconsistency = mean(
      abs(local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum)
    )
  ) %>%
  arrange(desc(electron_inconsistency)) %>%
  slice(seq_len(10)) %>%
  kable()
res_name electron_inconsistency
PLC 114.444444
LHG 34.096154
C8E 16.714286
NDP 11.333333
NAP 10.654545
PG4 9.633803
MLY 9.370370
CME 8.000000
MAN 8.000000
NAG 7.959596

Regresja liniowa

Liczba atomów

# Linear regression of local_res_atom_non_h_count on all other numeric columns.
# NOTE(review): the predictors still include columns such as
# dict_atom_non_h_count and local_res_atom_non_h_electron_sum, which look
# closely tied to the target - confirm that keeping them is intended.
data_partition <- select_if(data, is.numeric)

# Seeded split: 70% of the rows go to the training set, the rest to the test
# set.
set.seed(111)
partition <- createDataPartition(
  data_partition$local_res_atom_non_h_count,
  p = 0.7,
  list = FALSE
)

data_train <- slice(data_partition, partition)
data_test <- slice(data_partition, -partition)
dim(data_train)
## [1] 4368  347
dim(data_test)
## [1] 1871  347
# Fit the model with caret's default resampling and print its summary.
set.seed(111)
fit <- train(
  local_res_atom_non_h_count ~ .,
  data = data_train,
  method = "lm"
)
fit
## Linear Regression 
## 
## 4368 samples
##  346 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4368, 4368, 4368, 4368, 4368, 4368, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE      
##   2.405387  0.9278705  0.1214908
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Predict the atom count for the held-out rows and compare the predictions
# with the observed values.
set.seed(111)
prediction <- predict(fit, data_test)
postResample(
  pred = prediction,
  obs = data_test[["local_res_atom_non_h_count"]]
)
##       RMSE   Rsquared        MAE 
## 0.58514089 0.99823080 0.07083429

Liczba elektronów

# Linear regression of local_res_atom_non_h_electron_sum on all other numeric
# columns.
# NOTE(review): the predictors still include columns such as
# dict_atom_non_h_electron_sum and local_res_atom_non_h_count, which look
# closely tied to the target - confirm that keeping them is intended.
data_partition <- select_if(data, is.numeric)

# Seeded split: 70% of the rows go to the training set, the rest to the test
# set.
set.seed(111)
partition <- createDataPartition(
  data_partition$local_res_atom_non_h_electron_sum,
  p = 0.7,
  list = FALSE
)

data_train <- slice(data_partition, partition)
data_test <- slice(data_partition, -partition)
dim(data_train)
## [1] 4369  347
dim(data_test)
## [1] 1870  347
# Fit the model with caret's default resampling and print its summary.
set.seed(111)
fit <- train(
  local_res_atom_non_h_electron_sum ~ .,
  data = data_train,
  method = "lm"
)
fit
## Linear Regression 
## 
## 4369 samples
##  346 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4369, 4369, 4369, 4369, 4369, 4369, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   33.47963  0.8446019  1.274699
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Predict the electron sum for the held-out rows and compare the predictions
# with the observed values.
set.seed(111)
prediction <- predict(fit, data_test)
postResample(
  pred = prediction,
  obs = data_test[["local_res_atom_non_h_electron_sum"]]
)
##      RMSE  Rsquared       MAE 
## 2.6911609 0.9991266 0.4644615

Klasyfikator

Przewidywanie wartości res_name

dim(data)
## [1] 6239  350
# Columns excluded from the classifier's input. This reuses (and overwrites)
# the `removable_columns` name that was used earlier for the CSV import.
removable_columns <- c(
  "blob_coverage",
  "res_coverage",
  "local_res_atom_non_h_count",
  "local_res_atom_non_h_electron_sum",
  "dict_atom_non_h_count",
  "dict_atom_non_h_electron_sum"
)
# all_of() states explicitly that `removable_columns` is an external character
# vector of column names rather than a column of `data`; passing the bare
# vector to select() is ambiguous and deprecated in tidyselect.
data_partition <- data %>%
  select(-all_of(removable_columns))

dim(data_partition)
## [1] 6239  344
# The class label has to be a factor for classification.
data_partition[["res_name"]] <- as.factor(data_partition[["res_name"]])

# Seeded split on the class label: 70% of the rows go to the training set, the
# rest to the test set.
set.seed(111)
partition <- createDataPartition(
  data_partition$res_name,
  p = 0.7,
  list = FALSE
)

data_train <- slice(data_partition, partition)
data_test <- slice(data_partition, -partition)
dim(data_train)
## [1] 4391  344
dim(data_test)
## [1] 1848  344
# Random forest with 10 trees predicting res_name from all remaining columns;
# caret's default resampling selects mtry. The model summary is printed.
set.seed(111)
fit <- train(
  res_name ~ .,
  data = data_train,
  method = "rf",
  ntree = 10,
  na.action = na.pass
)
fit
## Random Forest 
## 
## 4391 samples
##  343 predictor
##   50 classes: 'ACT', 'ACY', 'ADP', 'AMP', 'BR', 'C8E', 'CA', 'CD', 'CIT', 'CL', 'CME', 'CU', 'CYC', 'DMS', 'EDO', 'EPE', 'FAD', 'FE', 'FE2', 'FEC', 'FES', 'FMN', 'FMT', 'GLC', 'GOL', 'H4B', 'HEC', 'HEM', 'IOD', 'K', 'LHG', 'MAN', 'MES', 'MG', 'MLY', 'MN', 'MPD', 'NAD', 'NAG', 'NAP', 'NDP', 'NI', 'PEG', 'PG4', 'PGE', 'PLC', 'PLP', 'PO4', 'SO4', 'ZN' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4391, 4391, 4391, 4391, 4391, 4391, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##     2   0.3910456  0.3411316
##   172   0.5329791  0.4972496
##   343   0.5309613  0.4952548
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 172.
# Classify the held-out rows and tabulate predictions against the true labels.
set.seed(111)
prediction <- predict(fit, data_test)
confusionMatrix(data = prediction, reference = data_test[["res_name"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ACT ACY ADP AMP  BR C8E  CA  CD CIT  CL CME  CU CYC DMS EDO EPE
##        ACT   6   0   0   0   0   0   0   0   0   2   1   0   0   0   5   0
##        ACY   0   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        ADP   0   0   5   0   0   0   0   0   0   0   0   0   0   0   0   0
##        AMP   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
##        BR    0   0   0   0   4   0   1   0   0   0   0   0   0   0   0   0
##        C8E   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0
##        CA    0   0   0   0   0   0  59   2   0   3   0   1   0   0   0   0
##        CD    0   0   0   0   0   0   1  10   0   0   0   0   0   0   0   0
##        CIT   1   0   0   0   0   0   0   0   2   0   0   0   0   0   0   0
##        CL    0   0   0   0   4   0   7   2   0  89   0   0   0   0   0   0
##        CME   0   0   0   0   0   0   0   0   0   0   4   0   0   0   0   0
##        CU    0   0   0   0   0   0   0   0   0   0   0   3   0   0   0   0
##        CYC   0   0   0   0   0   0   0   0   0   0   0   0   4   0   0   0
##        DMS   1   0   0   0   0   0   0   0   0   0   0   0   0  72   4   1
##        EDO   8   1   0   0   0   1   1   0   2   2   1   1   1   5  76   1
##        EPE   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4
##        FAD   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FE    0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##        FE2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FEC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FES   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##        FMN   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0
##        FMT   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        GLC   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##        GOL   9   3   2   3   0   2   0   0   1   0   0   0   0   3  28   1
##        H4B   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        HEC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        HEM   0   0   1   0   0   1   0   0   0   0   0   0   0   0   2   0
##        IOD   0   0   0   0   0   0   3   0   0   2   0   0   0   0   0   0
##        K     0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0
##        LHG   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##        MAN   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MES   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##        MG    0   0   0   0   1   0   8   2   0  11   0   0   0   0   1   0
##        MLY   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MN    0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##        MPD   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##        NAD   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        NAG   1   0   1   1   0   0   0   0   0   0   1   0   1   0   2   0
##        NAP   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##        NDP   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        NI    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##        PEG   0   0   0   0   0   0   0   0   0   0   1   0   0   0   3   0
##        PG4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##        PGE   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##        PLC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        PLP   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   2
##        PO4   0   0   0   0   0   0   0   0   1   0   0   0   0   0   1   0
##        SO4   8   3   2   5   0   1   0   1   0   0   0   0   1  18  14   3
##        ZN    0   0   0   0   0   0   2   7   0   3   0   4   0   0   0   0
##           Reference
## Prediction FAD  FE FE2 FEC FES FMN FMT GLC GOL H4B HEC HEM IOD   K LHG MAN
##        ACT   0   0   0   0   0   0   2   0   5   0   0   3   0   0   0   0
##        ACY   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        ADP   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##        AMP   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        BR    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        C8E   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        CA    0   0   0   0   0   0   0   0   0   0   0   0   1  12   0   0
##        CD    0   0   1   0   1   0   0   0   0   0   0   0   0   2   0   0
##        CIT   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        CL    0   0   0   0   0   0   0   0   0   0   0   0   5   3   0   0
##        CME   1   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##        CU    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        CYC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        DMS   0   0   0   0   0   0   1   0   2   0   0   0   0   0   0   0
##        EDO   2   0   0   1   0   0   3   0  26   0   1   7   1   0   0   0
##        EPE   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FAD  14   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##        FE    0   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FE2   0   0   8   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FEC   0   0   0   4   0   0   0   0   1   0   0   0   0   0   0   0
##        FES   0   0   0   0   5   0   0   0   0   0   0   0   0   0   0   0
##        FMN   1   0   0   0   0  11   0   0   0   0   0   0   0   0   0   0
##        FMT   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        GLC   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        GOL   2   0   0   1   0   2   3   3 101   0   0   6   0   0   3   4
##        H4B   1   0   0   0   0   1   0   0   0   5   0   0   0   0   0   0
##        HEC   0   0   0   0   0   0   0   0   0   0   5   0   0   0   0   0
##        HEM   2   0   0   1   0   0   0   0   1   0   4  42   0   0   0   1
##        IOD   0   0   0   0   0   0   0   0   0   0   0   0  18   0   0   0
##        K     0   0   0   0   0   0   0   0   0   0   0   0   0   4   0   0
##        LHG   0   0   0   0   0   0   0   0   0   0   0   0   0   0  12   0
##        MAN   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3
##        MES   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##        MG    0   1   0   0   0   0   0   0   0   0   0   0   2   1   0   0
##        MLY   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   0
##        MN    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MPD   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##        NAD   1   0   0   0   0   0   0   0   1   1   0   2   0   0   0   0
##        NAG   1   0   0   0   0   0   0   1   1   1   0   1   0   0   0   1
##        NAP   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        NDP   1   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##        NI    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        PEG   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   0
##        PG4   2   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##        PGE   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        PLC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        PLP   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   0
##        PO4   0   0   0   0   1   1   0   0   3   0   0   0   0   0   0   0
##        SO4   4   0   0   0   0   1   0   1  27   0   0   7   1   0   0   0
##        ZN    0   3   3   0   1   0   0   0   0   0   0   0   0   0   0   0
##           Reference
## Prediction MES  MG MLY  MN MPD NAD NAG NAP NDP  NI PEG PG4 PGE PLC PLP PO4
##        ACT   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
##        ACY   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
##        ADP   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        AMP   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
##        BR    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        C8E   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        CA    0  14   0  10   0   0   0   0   0   0   0   0   0   0   0   0
##        CD    0   0   0   1   0   0   0   0   0   2   0   0   0   0   0   0
##        CIT   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##        CL    0   7   0   0   0   0   0   0   0   1   1   0   0   0   0   0
##        CME   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##        CU    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        CYC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        DMS   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##        EDO   0   2   1   0   1   0   3   0   0   0   7   1   2   0   0   1
##        EPE   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##        FAD   0   0   0   0   0   3   0   0   0   0   0   0   0   0   1   0
##        FE    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FE2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FEC   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FES   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FMN   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        FMT   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        GLC   1   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##        GOL   2   1   3   0   4  10  14   3   3   0  15   7   1   0   2   7
##        H4B   0   0   0   0   0   0   2   0   1   0   0   0   0   0   0   0
##        HEC   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        HEM   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   2
##        IOD   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##        K     0   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##        LHG   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MAN   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MES   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        MG    0  28   0   2   0   1   0   0   0   1   0   0   0   0   0   0
##        MLY   0   0   7   0   0   0   0   0   0   0   1   0   0   0   0   0
##        MN    0   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0
##        MPD   0   0   0   0   0   0   1   0   0   0   0   1   0   0   0   0
##        NAD   0   0   0   0   1  17   3   0   3   0   0   0   0   0   1   0
##        NAG   1   0   2   0   0   4  87   0   0   0   2   3   3   0   2   1
##        NAP   0   0   0   0   0   0   0   9   2   0   0   0   0   0   0   0
##        NDP   0   0   0   0   0   0   1   1   2   0   0   0   0   0   0   0
##        NI    0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##        PEG   1   0   0   0   0   1   0   0   0   0   2   2   1   0   0   0
##        PG4   0   0   0   0   0   0   1   0   0   0   2   3   0   0   0   0
##        PGE   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##        PLC   0   0   0   0   0   0   0   0   0   0   0   0   0   8   0   0
##        PLP   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   1
##        PO4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2  10
##        SO4   0   0   2   0   2   1   3   3   1   0   1   2   1   0   1  27
##        ZN    0   1   0   1   0   0   0   0   0   2   0   0   0   0   0   0
##           Reference
## Prediction SO4  ZN
##        ACT   1   0
##        ACY   0   0
##        ADP   0   0
##        AMP   0   0
##        BR    0   0
##        C8E   0   0
##        CA    0   9
##        CD    0   6
##        CIT   0   0
##        CL    0   2
##        CME   1   0
##        CU    0   0
##        CYC   0   0
##        DMS   3   0
##        EDO  16   0
##        EPE   0   0
##        FAD   0   0
##        FE    1   2
##        FE2   0   1
##        FEC   0   0
##        FES   0   0
##        FMN   0   0
##        FMT   1   0
##        GLC   0   0
##        GOL  24   0
##        H4B   0   0
##        HEC   0   0
##        HEM   1   0
##        IOD   0   0
##        K     1   1
##        LHG   0   0
##        MAN   1   0
##        MES   0   0
##        MG    0   6
##        MLY   0   0
##        MN    0   2
##        MPD   0   0
##        NAD   1   0
##        NAG   1   0
##        NAP   0   0
##        NDP   0   0
##        NI    0   1
##        PEG   1   0
##        PG4   0   0
##        PGE   0   0
##        PLC   0   0
##        PLP   0   0
##        PO4  11   0
##        SO4 230   1
##        ZN    0  64
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5703          
##                  95% CI : (0.5474, 0.5931)
##     No Information Rate : 0.1591          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5361          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: ACT Class: ACY Class: ADP Class: AMP Class: BR
## Sensitivity            0.176471   0.363636   0.416667   0.000000  0.444444
## Specificity            0.988975   0.999456   0.999455   0.998910  0.999456
## Pos Pred Value         0.230769   0.800000   0.833333   0.000000  0.800000
## Neg Pred Value         0.984632   0.996202   0.996200   0.992958  0.997287
## Prevalence             0.018398   0.005952   0.006494   0.007035  0.004870
## Detection Rate         0.003247   0.002165   0.002706   0.000000  0.002165
## Detection Prevalence   0.014069   0.002706   0.003247   0.001082  0.002706
## Balanced Accuracy      0.582723   0.681546   0.708061   0.499455  0.721950
##                      Class: C8E Class: CA Class: CD Class: CIT Class: CL
## Sensitivity            0.250000   0.70238  0.384615   0.250000   0.78070
## Specificity            1.000000   0.97052  0.992316   0.998913   0.98155
## Pos Pred Value         1.000000   0.53153  0.416667   0.500000   0.73554
## Neg Pred Value         0.996750   0.98561  0.991228   0.996746   0.98552
## Prevalence             0.004329   0.04545  0.014069   0.004329   0.06169
## Detection Rate         0.001082   0.03193  0.005411   0.001082   0.04816
## Detection Prevalence   0.001082   0.06006  0.012987   0.002165   0.06548
## Balanced Accuracy      0.625000   0.83645  0.688466   0.624457   0.88112
##                      Class: CME Class: CU Class: CYC Class: DMS Class: EDO
## Sensitivity            0.500000  0.333333   0.571429    0.73469    0.54676
## Specificity            0.997826  1.000000   1.000000    0.99200    0.94207
## Pos Pred Value         0.500000  1.000000   1.000000    0.83721    0.43429
## Neg Pred Value         0.997826  0.996748   0.998373    0.98524    0.96234
## Prevalence             0.004329  0.004870   0.003788    0.05303    0.07522
## Detection Rate         0.002165  0.001623   0.002165    0.03896    0.04113
## Detection Prevalence   0.004329  0.001623   0.002165    0.04654    0.09470
## Balanced Accuracy      0.748913  0.666667   0.785714    0.86335    0.74442
##                      Class: EPE Class: FAD Class: FE Class: FE2 Class: FEC
## Sensitivity            0.333333   0.424242  0.555556   0.666667   0.571429
## Specificity            0.999455   0.997245  0.997281   0.999455   0.998914
## Pos Pred Value         0.800000   0.736842  0.500000   0.888889   0.666667
## Neg Pred Value         0.995659   0.989612  0.997824   0.997825   0.998371
## Prevalence             0.006494   0.017857  0.004870   0.006494   0.003788
## Detection Rate         0.002165   0.007576  0.002706   0.004329   0.002165
## Detection Prevalence   0.002706   0.010281  0.005411   0.004870   0.003247
## Balanced Accuracy      0.666394   0.710744  0.776418   0.833061   0.785171
##                      Class: FES Class: FMN Class: FMT Class: GLC
## Sensitivity            0.625000   0.687500  0.0000000   0.000000
## Specificity            0.999457   0.998362  0.9994562   0.997827
## Pos Pred Value         0.833333   0.785714  0.0000000   0.000000
## Neg Pred Value         0.998371   0.997274  0.9951272   0.996204
## Prevalence             0.004329   0.008658  0.0048701   0.003788
## Detection Rate         0.002706   0.005952  0.0000000   0.000000
## Detection Prevalence   0.003247   0.007576  0.0005411   0.002165
## Balanced Accuracy      0.812228   0.842931  0.4997281   0.498914
##                      Class: GOL Class: H4B Class: HEC Class: HEM
## Sensitivity             0.57386   0.625000   0.500000    0.60000
## Specificity             0.89713   0.997283   1.000000    0.98988
## Pos Pred Value          0.36996   0.500000   1.000000    0.70000
## Neg Pred Value          0.95238   0.998368   0.997287    0.98434
## Prevalence              0.09524   0.004329   0.005411    0.03788
## Detection Rate          0.05465   0.002706   0.002706    0.02273
## Detection Prevalence    0.14773   0.005411   0.002706    0.03247
## Balanced Accuracy       0.73550   0.811141   0.750000    0.79494
##                      Class: IOD Class: K Class: LHG Class: MAN Class: MES
## Sensitivity             0.64286 0.181818   0.800000   0.333333   0.285714
## Specificity             0.99670 0.996714   0.999454   0.999456   0.998914
## Pos Pred Value          0.75000 0.400000   0.923077   0.750000   0.500000
## Neg Pred Value          0.99452 0.990207   0.998365   0.996746   0.997289
## Prevalence              0.01515 0.011905   0.008117   0.004870   0.003788
## Detection Rate          0.00974 0.002165   0.006494   0.001623   0.001082
## Detection Prevalence    0.01299 0.005411   0.007035   0.002165   0.002165
## Balanced Accuracy       0.81978 0.589266   0.899727   0.666395   0.642314
##                      Class: MG Class: MLY Class: MN Class: MPD Class: NAD
## Sensitivity            0.48276   0.437500 0.0588235   0.000000   0.447368
## Specificity            0.97933   0.998362 0.9978154   0.997826   0.992265
## Pos Pred Value         0.43077   0.700000 0.2000000   0.000000   0.548387
## Neg Pred Value         0.98317   0.995103 0.9913185   0.995662   0.988442
## Prevalence             0.03139   0.008658 0.0091991   0.004329   0.020563
## Detection Rate         0.01515   0.003788 0.0005411   0.000000   0.009199
## Detection Prevalence   0.03517   0.005411 0.0027056   0.002165   0.016775
## Balanced Accuracy      0.73104   0.717931 0.5283195   0.498913   0.719817
##                      Class: NAG Class: NAP Class: NDP Class: NI Class: PEG
## Sensitivity             0.73729   0.562500   0.153846 0.1428571   0.064516
## Specificity             0.98150   0.998362   0.997820 0.9989136   0.993396
## Pos Pred Value          0.73109   0.750000   0.333333 0.3333333   0.142857
## Neg Pred Value          0.98207   0.996187   0.994028 0.9967480   0.984188
## Prevalence              0.06385   0.008658   0.007035 0.0037879   0.016775
## Detection Rate          0.04708   0.004870   0.001082 0.0005411   0.001082
## Detection Prevalence    0.06439   0.006494   0.003247 0.0016234   0.007576
## Balanced Accuracy       0.85940   0.780431   0.575833 0.5708854   0.528956
##                      Class: PG4 Class: PGE Class: PLC Class: PLP
## Sensitivity            0.142857  0.0000000   1.000000   0.166667
## Specificity            0.996169  0.9994565   1.000000   0.996732
## Pos Pred Value         0.300000  0.0000000   1.000000   0.250000
## Neg Pred Value         0.990207  0.9956687   1.000000   0.994565
## Prevalence             0.011364  0.0043290   0.004329   0.006494
## Detection Rate         0.001623  0.0000000   0.004329   0.001082
## Detection Prevalence   0.005411  0.0005411   0.004329   0.004329
## Balanced Accuracy      0.569513  0.4997283   1.000000   0.581699
##                      Class: PO4 Class: SO4 Class: ZN
## Sensitivity            0.200000     0.7823   0.67368
## Specificity            0.988877     0.9086   0.98460
## Pos Pred Value         0.333333     0.6183   0.70330
## Neg Pred Value         0.977998     0.9566   0.98236
## Prevalence             0.027056     0.1591   0.05141
## Detection Rate         0.005411     0.1245   0.03463
## Detection Prevalence   0.016234     0.2013   0.04924
## Balanced Accuracy      0.594438     0.8455   0.82914